package com.example.spider.cluster.handler.cctv;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import com.example.spider.cluster.common.task.CctvNewsDetailTask;
import com.example.spider.common.handler.SpiderResultHandler;
import com.example.spider.common.task.SpiderTask;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.UUID;
import java.util.regex.Pattern;

/**
 * Handles the result of a CCTV news detail task: extracts the article body
 * paragraphs from the fetched HTML and writes them, one per line, to a UTF-8
 * text file under the configured save directory.
 *
 * @author lym
 */
@Slf4j
@Component
public class CctvNewsDetailResultHandler implements SpiderResultHandler {

    /** Captures everything between the body begin/end marker comments. */
    private static final Pattern CONTENT_PATTERN =
            Pattern.compile("<!--repaste.body.begin-->(.*?)<!--repaste.body.end-->", Pattern.DOTALL);

    /** Captures the inner text of each {@code <p>} element (single-line paragraphs only). */
    private static final Pattern PARAGRAPH_PATTERN = Pattern.compile("<p.*?>(.*?)</p>");

    // TODO handle images?
    // NOTE(review): currently unused. The trailing "[/>|</img>]" is a character class
    // (matches ONE of those characters), not an alternation -- revisit before using it.
    private static final Pattern IMAGE_PATTERN = Pattern.compile("<img.*?src=\"(.*?)\".*?[/>|</img>]");

    /** Maximum number of title characters kept in a generated file name. */
    private static final int MAX_TITLE_LENGTH = 123;

    /**
     * Literal (non-regex) replacements applied to every paragraph: HTML entities
     * are decoded and bold tags are stripped.
     * TODO extend with further entities, e.g. &amp;divide; &amp;lt; &amp;gt;
     */
    private static final String[][] PARAGRAPH_REPLACEMENTS = {
            // single quotes
            {"&lsquo;", "‘"},
            {"&rsquo;", "’"},
            // double quotes
            {"&ldquo;", "“"},
            {"&rdquo;", "”"},
            // dash
            {"&mdash;", "——"},
            // non-breaking space
            {"&nbsp;", " "},
            // ellipsis
            {"&hellip;", "..."},
            {"&middot;", "·"},
            {"&times;", "×"},
            {"&permil;", "‰"},
            // strip bold markup
            {"<strong>", ""},
            {"</strong>", ""},
    };

    /** Directory prefix for saved files; always ends with a path separator. */
    private final String savePath;

    /**
     * Creates the handler and prepares the save directory.
     *
     * @param savePath value of {@code spider.cctvNews.save.path}; the directory is
     *                 created when it does not exist yet
     * @throws IllegalStateException if the path exists but is not a directory
     */
    CctvNewsDetailResultHandler(@Value("${spider.cctvNews.save.path}") String savePath) {
        File dir = new File(savePath);
        if (!dir.exists()) {
            FileUtil.mkdir(dir);
        } else if (!dir.isDirectory()) {
            throw new IllegalStateException("illegal settings 'spider.cctvNews.save.path':" + savePath);
        }
        // Normalize so that file names can simply be appended.
        if (!savePath.endsWith("/") && !savePath.endsWith("\\")) {
            savePath = savePath + File.separator;
        }
        this.savePath = savePath;
    }

    /**
     * Accepts every {@link CctvNewsDetailTask}, regardless of the result content.
     */
    @Override
    public boolean canHandle(SpiderTask task, String result) {
        return task instanceof CctvNewsDetailTask;
    }

    /**
     * Extracts the article paragraphs from {@code result} and saves them to a text
     * file named after the news title. Logs an error and returns without writing
     * anything when the body markers or the paragraphs cannot be matched.
     *
     * @param task   the task that produced the result; must be a {@link CctvNewsDetailTask}
     * @param result the fetched HTML of the news detail page
     */
    @Override
    public void handle(SpiderTask task, String result) {
        log.info("prepare handing CctvNewsDetail{}", task.getUrl());

        // Extract the article body.
        List<String> contextMatchList = ReUtil.findAll(CONTENT_PATTERN, result, 1);
        if (CollectionUtils.isEmpty(contextMatchList)) {
            // TODO some news consist of images only and therefore never match; handle them
            log.error("cctvNewsDetail invalid(context match fail). Task:{}", task);
            return;
        }
        String context = contextMatchList.get(0);

        // Extract every text paragraph of the body.
        List<String> paragraphMatchList = ReUtil.findAll(PARAGRAPH_PATTERN, context, 1);
        if (CollectionUtils.isEmpty(paragraphMatchList)) {
            // TODO some news consist of images only and therefore never match; handle them
            log.error("cctvNewsDetail invalid(paragraph match fail). Task:{}", task);
            return;
        }

        for (int i = 0; i < paragraphMatchList.size(); i++) {
            String cleaned = cleanParagraph(paragraphMatchList.get(i));
            paragraphMatchList.set(i, cleaned);
            if (cleaned.contains("<im")) {
                // Images are not processed yet; surface them through the logger instead of stdout.
                log.debug("paragraph still contains an image tag: {}", cleaned);
            }
        }

        // Save to file.
        CctvNewsDetailTask detailTask = (CctvNewsDetailTask) task;
        String title = detailTask.getItem().getTitle();
        String fileName = generateFileName(title);
        log.info("prepare save to file: {}", fileName);
        File file = FileUtil.touch(fileName);
        FileUtil.writeLines(paragraphMatchList, file, StandardCharsets.UTF_8);

        log.info("finished handle CctvNewsDetail{}", task.getUrl());
    }

    /** Applies all {@link #PARAGRAPH_REPLACEMENTS} to a single paragraph. */
    private static String cleanParagraph(String paragraph) {
        String cleaned = paragraph;
        for (String[] replacement : PARAGRAPH_REPLACEMENTS) {
            cleaned = cleaned.replace(replacement[0], replacement[1]);
        }
        return cleaned;
    }

    /**
     * Builds the absolute target file name for a news title.
     *
     * <p>Characters that are illegal or awkward in file names are replaced, the
     * title is truncated to its first {@value #MAX_TITLE_LENGTH} characters, and the
     * result is placed under the save directory with a {@code .txt} extension. A
     * blank title falls back to a random UUID (without dashes) as the base name.
     *
     * @param title the news title, may be blank
     * @return the full path of the file to write
     */
    private String generateFileName(String title) {
        if (StrUtil.isBlank(title)) {
            // Keep the fallback inside the save directory, with the same extension as regular files.
            return savePath + UUID.randomUUID().toString().replace("-", "") + ".txt";
        }
        // Characters that are not allowed in Windows file names: < > / \ | : * ?
        title = title.replace("/", "or")
                .replace("|", "or")
                .replace("\\", "or")
                .replace("<", "《")
                .replace(">", "》")
                .replace(":", "：")
                .replace("*", "×")
                .replace("?", "？")
                // Characters that are troublesome in Linux shells.
                .replace(" ", "_")
                .replace("$", "＄")
                .replace("&", "and")
                .replace("\t", "_")
                .replace("@", "_")
                .replace("(", "（")
                .replace(")", "）")
                .replace(".", "。")
                .replace("+", "＋");
        if (title.length() > MAX_TITLE_LENGTH) {
            // Keep the leading part of the title; substring(int) alone would keep only the tail.
            title = title.substring(0, MAX_TITLE_LENGTH);
        }
        return savePath + title + ".txt";
    }
}
